import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from matplotlib.pyplot import plot
import seaborn as sns
%matplotlib inline
from sqlalchemy import create_engine
import psycopg2
from datetime import timedelta, date, datetime
import pandas as pd
import urllib
from sqlalchemy import create_engine
from matplotlib import style
import warnings
warnings.filterwarnings("ignore")
import os
from sklearn.neighbors import NearestNeighbors
from random import sample
from numpy.random import uniform
import numpy as np
from math import isnan
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import PowerTransformer
import numpy as np
import pandas as pd
from datetime import datetime
from tqdm import tqdm
from sklearn.preprocessing import PowerTransformer
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_validate
import sklearn.metrics as metrics
from sklearn.model_selection import cross_val_score
import pickle
The data covers just March of 2020.
# Load March-2020 COVID case data from CSV, round-trip it through PostgreSQL,
# and normalize accented text columns to plain ASCII.
cases = pd.read_csv('Casos1.csv')
cases.head()
engine = create_engine('postgresql://postgres:olimac@localhost:5432/PlatziDB')
# if_exists='replace' lets the notebook be re-run without a "table already exists" error
cases.to_sql('cases', engine, if_exists='replace')
conn = psycopg2.connect('dbname=PlatziDB user=postgres host=localhost password=olimac')
cur = conn.cursor()
cur.execute('''SELECT * FROM cases;''')
DB_cases = pd.DataFrame(list(cur.fetchall()))
DB_cases.columns = ["ID_sql","ID", "date", "city", "departamento", "state", "age", "sex", "type", "procedence"]
# np.object was removed in NumPy 1.24; the builtin `object` selects string columns
cols_ = DB_cases.select_dtypes(include=[object]).columns
# Strip accents/diacritics: NFKD-decompose, then drop the non-ASCII combining marks
DB_cases[cols_] = DB_cases[cols_].apply(lambda x: x.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8'))
DB_cases.head()
print("Number of cases: " + str(len(DB_cases)))
print("The timeframe considered is between " + str(min(DB_cases['date'])) + " and " + str(max(DB_cases['date'])))
DB_cases.isnull().sum()
- The top three departments are: Bogota D.C., Valle del Cauca and Antioquia, with around 68% of participation over total cases in March. The following chart displays the number of cases per department, sorted.
# Share of cases per department, plus a count plot sorted by frequency.
dept_share = DB_cases['departamento'].value_counts(normalize=True)
print(dept_share)
plt.figure(figsize=(15, 8))
sns.countplot(y="departamento", data=DB_cases, palette="Set3", order=list(dept_share.index))
- Yes, it seems that people who belong to the Hospital, Hospital UCI and Fallecido categories tend to have a higher median age compared to people who belong to the Recuperado or Casa categories.
# Age distribution per patient state.
plt.figure(figsize=(15, 8))
ax = sns.boxplot(data=DB_cases, x="state", y="age", palette="Set3")
- No, it seems that there is no clear relationship between age and type of register.
# Age distribution per register type.
plt.figure(figsize=(15, 8))
ax = sns.boxplot(data=DB_cases, x="type", y="age", palette="Set3")
- It means that the proportion between F and M for the 'Recuperado' and 'Casa' categories is the same — a bit more F than M. On the other hand, for the 'Hospital', 'Hospital UCI' and 'Fallecido' categories the proportion of F and M is different: there are more M than F.
# Case counts per state, split by sex.
plt.figure(figsize=(15, 8))
ax = sns.countplot(data=DB_cases, x="state", hue="sex", palette="Set3")
- For 'Recuperado', 'Casa', 'Hospital UCI' and 'Fallecido' there is no clear relationship between age and sex. However, for 'Hospital' it seems that M tend to be hospitalized at a lower age than F.
# Age distribution per state, split by sex.
plt.figure(figsize=(15, 8))
ax = sns.boxplot(data=DB_cases, x="state", y="age", hue="sex", palette="Set3")
- It is clear that the number of people at home increases over the month, while the number of people at the hospital seems to decrease at the end of the month.
# Set up the figure size for the time series.
fig, ax = plt.subplots(figsize=(15, 7))
# Count registers per day and per state, then plot one line per state.
day_index = pd.DatetimeIndex(DB_cases.date).to_period("D")
DB_cases.groupby([day_index, 'state']).count()['ID'].unstack().plot(ax=ax)
- It seems that during March the growth trend was exponential.
# Build per-state cumulative case counts over time, then plot 'Casa'
# separately from the remaining states (its scale dwarfs the others).
df = DB_cases.copy()
df = df.assign(ones=1)
df = df.sort_values(['date']).reset_index(drop=True)
# Running total of cases within each state, in date order.
# The deprecated/removed `axis=0` argument to GroupBy.cumsum is dropped — it was the default.
df["cum_ones"] = df.groupby(['state'])['ones'].cumsum()
depar = ['Casa']
df_casa = df[df['state'].isin(depar)]
fig, ax = plt.subplots(figsize=(15, 7))
# Daily max of the running total == cumulative count at the end of each day
df_casa.groupby([pd.DatetimeIndex(df_casa.date).to_period("D"), 'state']).max()['cum_ones'].unstack().plot(ax=ax)
depar = ['Casa']
df_casa = df[~df['state'].isin(depar)]
fig, ax = plt.subplots(figsize=(15, 7))
# Same cumulative plot for every state except 'Casa'
df_casa.groupby([pd.DatetimeIndex(df_casa.date).to_period("D"), 'state']).max()['cum_ones'].unstack().plot(ax=ax)
DB_cases.columns
# Distribution of register types: overall share, then split by sex and by state.
df_proce = DB_cases.copy()
type_share = df_proce['type'].value_counts(normalize=True)
print(type_share)
plt.figure(figsize=(15, 8))
sns.countplot(y="type", data=df_proce, palette="Set3", order=list(type_share.index))
plt.figure(figsize=(15, 8))
ax = sns.countplot(data=DB_cases, x="type", hue="sex", palette="Set3")
plt.figure(figsize=(15, 8))
ax = sns.countplot(data=DB_cases, x="type", hue="state", palette="Set3")
# Countries of origin for imported cases, sorted by share.
type_filter = ['Importado']
df_proce_importado = df_proce[df_proce['type'].isin(type_filter)]
procedence_share = df_proce_importado['procedence'].value_counts(normalize=True)
print(procedence_share)
plt.figure(figsize=(20, 20))
sns.countplot(y="procedence", data=df_proce_importado, palette="Set3", order=list(procedence_share.index))
# Departments where cases are still under investigation ('En estudio').
type_filter = ['En estudio']
df_proce_importado = df_proce[df_proce['type'].isin(type_filter)]
dept_share = df_proce_importado['departamento'].value_counts(normalize=True)
print(dept_share)
plt.figure(figsize=(20, 20))
sns.countplot(y="departamento", data=df_proce_importado, palette="Set3", order=list(dept_share.index))
# Departments of locally-related ('Relacionado') cases.
type_filter = ['Relacionado']
df_proce_importado = df_proce[df_proce['type'].isin(type_filter)]
dept_share = df_proce_importado['departamento'].value_counts(normalize=True)
print(dept_share)
plt.figure(figsize=(20, 20))
sns.countplot(y="departamento", data=df_proce_importado, palette="Set3", order=list(dept_share.index))
# Zoom into the top three departments: city breakdown and age-by-state boxplot.
top_departments = ['Bogota D.C.', 'Valle del Cauca', 'Antioquia']
DB_cases_copy = DB_cases.copy()
DB_cases_copy = DB_cases_copy[DB_cases_copy['departamento'].isin(top_departments)]
city_share = DB_cases_copy['city'].value_counts(normalize=True)
print(city_share)
plt.figure(figsize=(15, 8))
sns.countplot(y="city", data=DB_cases_copy, palette="Set3", order=list(city_share.index))
plt.figure(figsize=(15, 8))
ax = sns.boxplot(data=DB_cases_copy, x="departamento", y="age", hue="state", palette="Set3")
- This is an unsupervised model that allows you to identify groups of registers with similar features. After identifying these groups, several strategies could be created and adjusted depending on the group selected. The most common method is K-means. It works really well for numerical values but not for mixed data; that is why I used a data transformation method in order to take into account both numerical and categorical variables. This method is called Factor Analysis of Mixed Data (FAMD).
DB_cases.columns
# Features fed to FAMD / K-means: all categoricals plus age.
cluster_features = ['city', 'departamento', 'state', 'age', 'sex', 'type', 'procedence']
Data_k = DB_cases[cluster_features]
In this case I created two components and plotted them.
# Factor Analysis of Mixed Data: project the mixed categorical/numerical
# features onto 2 numeric components so K-means can cluster them.
import prince
famd = prince.FAMD(
    n_components=2,
    n_iter=3,
    copy=True,
    check_input=True,
    engine='auto',
    random_state=42
)
famd = famd.fit(Data_k)
l = famd.row_coordinates(Data_k)
print(l[0])
sns.relplot(x=l[0], y=l[1])
# BUG FIX: the original did pd.DataFrame(data=l, columns=['FAMD_component_1',
# 'FAMD_component_2']) — since l's columns are 0 and 1, that produced an
# all-NaN frame which then had to be refilled column by column. Renaming the
# two coordinate columns directly yields the same final frame in one step.
principalDf = l.rename(columns={0: 'FAMD_component_1', 1: 'FAMD_component_2'})
principalDf.shape
print(principalDf)
# Elbow method: compute the K-means inertia (SSE) for k = 2..29 and plot it.
kmeans_kwargs = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42,
}
sse = {}
for n_clusters in tqdm(range(2, 30)):
    model = KMeans(n_clusters=n_clusters, **kmeans_kwargs).fit(principalDf)
    # Inertia: sum of squared distances of samples to their closest centroid
    sse[n_clusters] = model.inertia_
fig = go.Figure(data=go.Scatter(x=list(sse.keys()), y=list(sse.values())))
fig.show()
from sklearn.metrics import silhouette_score
# Silhouette coefficient for k = 2..29.
# Note: silhouette is only defined for at least 2 clusters, hence the start at 2.
silhouette_coefficients = []
for n_clusters in range(2, 30):
    model = KMeans(n_clusters=n_clusters, **kmeans_kwargs)
    model.fit(principalDf)
    silhouette_coefficients.append(silhouette_score(principalDf, model.labels_))
plt.style.use("fivethirtyeight")
plt.plot(range(2, 30), silhouette_coefficients)
plt.xticks(range(2, 30))
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Coefficient")
plt.show()
# Power-transform each FAMD component toward a Gaussian shape, then cluster
# the transformed 2-D points into 4 groups with K-means.
# NOTE(review): `data` is an alias of principalDf, NOT a copy — the loop below
# mutates principalDf in place, so later cells see the transformed values.
# Presumably intentional (the plotting cell below copies this frame) — confirm.
data = principalDf
for c in data.columns:
    # Fresh transformer per column; fit_transform expects a 2-D array, hence reshape
    pt = PowerTransformer()
    data.loc[:, c] = pt.fit_transform(np.array(data[c]).reshape(-1, 1))
kmeans = KMeans(n_clusters=4,max_iter= 1000000).fit(data)
kmeans_labels = kmeans.labels_
# Show how many points fell into each of the 4 clusters
pd.Series(kmeans_labels).value_counts()
# Attach the K-means labels to both the 2-D FAMD frame (for the scatter plot)
# and the original case table (for profiling the clusters below).
lgbm_data = principalDf.copy()
lgbm_data['Clusters_k_means'] = kmeans_labels
print(lgbm_data)
print(lgbm_data.columns)
sns.relplot(data=lgbm_data, x='FAMD_component_1', y='FAMD_component_2', hue='Clusters_k_means', style='Clusters_k_means', palette="Paired")
fin_lgbm_data = DB_cases.copy()
fin_lgbm_data['Clusters_k_means'] = kmeans_labels
fin_lgbm_data.head()
- Clearly all clusters can be identified by their age range. Clusters 1 and 3 have a higher age range than clusters 0 and 2.
# Profile clusters against the numerical feature (age): box + strip plots.
numerical_sig = ['age']
for feature in numerical_sig:
    plt.figure(figsize=(15, 8))
    ax = sns.boxplot(x="Clusters_k_means", y=feature, data=fin_lgbm_data)
    ax2 = sns.catplot(x="Clusters_k_means", y=feature, data=fin_lgbm_data)
- As clusters 1 and 3 are really similar regarding age range, they can be distinguished because cluster 3 considers just 'Importado' people while cluster 1 considers 'Relacionado' and 'En estudio'.
- As clusters 0 and 2 are really similar regarding age range, they can be distinguished because cluster 2 considers 'Relacionado' or 'En estudio' and cluster 0 considers mostly 'Importado'. In addition, the distribution of F and M changes between clusters 0 and 2.
# Profile clusters against each categorical feature with per-cluster counts.
categorical_sig = ['state', 'sex', 'type']
for feature in categorical_sig:
    plt.figure(figsize=(10, 4))  # one fresh figure per feature
    sns.countplot(y="Clusters_k_means", hue=feature, data=fin_lgbm_data)